In [1]:
import pymysql
db = pymysql.connect(
"db.fastcamp.us",
"root",
"dkstncks",
"sakila",
charset='utf8',
)
In [2]:
customer_df = pd.read_sql("SELECT * FROM customer;", db)
payment_df = pd.read_sql("SELECT * FROM payment;", db)
In [3]:
customer_df.head(1)
Out[3]:
customer_id
store_id
first_name
last_name
email
address_id
active
create_date
last_update
0
1
1
MARY
SMITH
MARY.SMITH@sakilacustomer.org
5
1
2006-02-14 22:04:36
2006-02-15 04:57:20
In [4]:
payment_df.head(1)
Out[4]:
payment_id
customer_id
staff_id
rental_id
amount
payment_date
last_update
0
1
1
1
76.0
2.99
2005-05-25 11:30:37
2006-02-15 22:12:30
In [ ]:
In [5]:
SQL_QUERY = """
SELECT c.first_name, c.last_name, SUM(p.amount) "Revenue"
FROM
customer c
JOIN payment p
ON p.customer_id = c.customer_id
GROUP BY c.customer_id
ORDER BY Revenue DESC
;
"""
pd.read_sql(SQL_QUERY, db)
Out[5]:
first_name
last_name
Revenue
0
KARL
SEAL
221.55
1
ELEANOR
HUNT
216.54
2
CLARA
SHAW
195.58
3
MARION
SNYDER
194.61
4
RHONDA
KENNEDY
194.61
5
TOMMY
COLLAZO
186.62
6
WESLEY
BULL
177.60
7
TIM
CARY
175.61
8
MARCIA
DEAN
175.58
9
ANA
BRADLEY
174.66
10
JUNE
CARROLL
173.63
11
LENA
JENSEN
170.67
12
DIANE
COLLINS
169.65
13
ARNOLD
HAVENS
167.67
14
CURTIS
IRBY
167.62
15
MIKE
WAY
166.65
16
DAISY
BATES
162.62
17
TONYA
CHAPMAN
161.68
18
LOUIS
LEONE
161.65
19
GORDON
ALLARD
160.68
20
BRITTANY
RILEY
159.72
21
GUY
BROWNLEE
159.68
22
WARREN
SHERROD
159.67
23
MARGIE
WADE
159.64
24
MAE
FLETCHER
158.69
25
STEVE
MACKENZIE
158.66
26
HARRY
ARCE
157.65
27
PRISCILLA
LOWE
157.65
28
ARTHUR
SIMPKINS
155.68
29
MICHELLE
CLARK
155.65
...
...
...
...
569
JO
FOWLER
73.80
570
JUAN
FRALEY
73.77
571
FELIX
GAFFNEY
73.76
572
JONATHAN
SCARBOROUGH
72.82
573
TERRY
GRISSOM
72.80
574
EUGENE
CULPEPPER
71.81
575
EDITH
MCDONALD
71.81
576
ANTHONY
SCHWAB
71.80
577
LAUREN
HUDSON
71.80
578
SAMANTHA
DUNCAN
71.77
579
IRMA
PEARSON
70.82
580
WAYNE
TRUONG
70.81
581
FLOYD
GANDY
69.83
582
PENNY
NEAL
68.82
583
JILL
HAWKINS
68.79
584
VERA
MCCOY
67.82
585
JOANN
GARDNER
66.84
586
LLOYD
DOWD
66.81
587
ALBERTO
HENNING
66.79
588
LESTER
KRAUS
65.84
589
KIRK
STCLAIR
64.81
590
MATTIE
HOFFMAN
64.78
591
ANITA
MORALES
62.85
592
TIFFANY
JORDAN
59.86
593
KATHERINE
RIVERA
58.86
594
ANNIE
RUSSELL
58.82
595
JOHNNY
TURPIN
57.81
596
BRIAN
WYMAN
52.88
597
LEONA
OBRIEN
50.86
598
CAROLINE
BOWMAN
50.85
599 rows × 3 columns
In [6]:
SQL_QUERY = """
SELECT
c.customer_id,
SUM(p.amount)
FROM payment p, customer c
WHERE p.customer_id = c.customer_id
GROUP BY c.customer_id
;
"""
pd.read_sql(SQL_QUERY, db)
Out[6]:
customer_id
SUM(p.amount)
0
1
118.68
1
2
128.73
2
3
135.74
3
4
81.78
4
5
144.62
5
6
93.72
6
7
151.67
7
8
92.76
8
9
89.77
9
10
99.75
10
11
106.76
11
12
103.72
12
13
131.73
13
14
117.72
14
15
134.68
15
16
120.71
16
17
98.79
17
18
91.78
18
19
125.76
19
20
115.70
20
21
155.65
21
22
113.78
22
23
119.70
23
24
95.75
24
25
115.71
25
26
152.66
26
27
126.69
27
28
111.68
28
29
140.64
29
30
123.66
...
...
...
569
570
99.74
570
571
121.76
571
572
108.75
572
573
120.71
573
574
109.72
574
575
126.71
575
576
139.66
576
577
118.72
577
578
96.78
578
579
111.73
579
580
99.73
580
581
107.73
581
582
113.75
582
583
117.77
583
584
129.70
584
585
117.76
585
586
64.81
586
587
108.74
587
588
115.71
588
589
129.72
589
590
112.75
590
591
134.73
591
592
111.71
592
593
113.74
593
594
130.73
594
595
117.70
595
596
96.72
596
597
99.75
597
598
83.78
598
599
83.81
599 rows × 2 columns
In [13]:
payment_df.groupby("customer_id").agg({"amount": np.sum})
Out[13]:
amount
customer_id
1
118.68
2
128.73
3
135.74
4
81.78
5
144.62
6
93.72
7
151.67
8
92.76
9
89.77
10
99.75
11
106.76
12
103.72
13
131.73
14
117.72
15
134.68
16
120.71
17
98.79
18
91.78
19
125.76
20
115.70
21
155.65
22
113.78
23
119.70
24
95.75
25
115.71
26
152.66
27
126.69
28
111.68
29
140.64
30
123.66
...
...
570
99.74
571
121.76
572
108.75
573
120.71
574
109.72
575
126.71
576
139.66
577
118.72
578
96.78
579
111.73
580
99.73
581
107.73
582
113.75
583
117.77
584
129.70
585
117.76
586
64.81
587
108.74
588
115.71
589
129.72
590
112.75
591
134.73
592
111.71
593
113.74
594
130.73
595
117.70
596
96.72
597
99.75
598
83.78
599
83.81
599 rows × 1 columns
In [7]:
rental_df = pd.read_sql("SELECT * FROM rental;", db)
In [8]:
rental_df.head(1)
Out[8]:
rental_id
rental_date
inventory_id
customer_id
return_date
staff_id
last_update
0
1
2005-05-24 22:53:30
367
130
2005-05-26 22:04:30
1
2006-02-15 21:30:53
In [9]:
customer_df.head(1)
Out[9]:
customer_id
store_id
first_name
last_name
email
address_id
active
create_date
last_update
0
1
1
MARY
SMITH
MARY.SMITH@sakilacustomer.org
5
1
2006-02-14 22:04:36
2006-02-15 04:57:20
In [10]:
SQL_QUERY = """
SELECT
c.first_name,
c.last_name,
COUNT(*) "rentals_per_customer"
FROM
rental r
JOIN customer c
ON r.customer_id = c.customer_id
GROUP BY c.customer_id
HAVING rentals_per_customer >=30
ORDER BY 3 DESC
;
"""
pd.read_sql(SQL_QUERY, db)
Out[10]:
first_name
last_name
rentals_per_customer
0
ELEANOR
HUNT
46
1
KARL
SEAL
45
2
MARCIA
DEAN
42
3
CLARA
SHAW
42
4
TAMMY
SANDERS
41
5
SUE
PETERS
40
6
WESLEY
BULL
40
7
TIM
CARY
39
8
RHONDA
KENNEDY
39
9
MARION
SNYDER
39
10
ELIZABETH
BROWN
38
11
DAISY
BATES
38
12
CURTIS
IRBY
38
13
TOMMY
COLLAZO
38
14
BRANDON
HUEY
37
15
ELSIE
KELLEY
37
16
JUNE
CARROLL
37
17
MARSHA
DOUGLAS
37
18
ROGER
QUINTANILLA
36
19
RUSSELL
BRINSON
36
20
ANGELA
HERNANDEZ
36
21
ALEXANDER
FENNELL
36
22
JUSTIN
NGO
36
23
MARGIE
WADE
36
24
ALMA
AUSTIN
35
25
ROSEMARY
SCHMIDT
35
26
LOUIS
LEONE
35
27
BOBBY
BOUDREAU
35
28
MICHELLE
CLARK
35
29
HARRY
ARCE
35
...
...
...
...
148
RONALD
WEINER
30
149
JUANITA
MASON
30
150
CLARENCE
GAMEZ
30
151
MABEL
HOLLAND
30
152
GLORIA
COOK
30
153
RAMONA
HALE
30
154
THERESA
WATSON
30
155
MARILYN
ROSS
30
156
JOSHUA
MARK
30
157
ROSA
REYNOLDS
30
158
MANUEL
MURRELL
30
159
KYLE
SPURLOCK
30
160
JORDAN
ARCHULETA
30
161
SALVADOR
TEEL
30
162
GREG
ROBINS
30
163
VIOLET
RODRIQUEZ
30
164
GERALDINE
PERKINS
30
165
CAROLYN
PEREZ
30
166
MICHELE
GRANT
30
167
JEFF
EAST
30
168
CAROLE
BARNETT
30
169
JAMES
GANNON
30
170
JOSEPH
JOY
30
171
FLORENCE
WOODS
30
172
WALTER
PERRYMAN
30
173
WENDY
HARRISON
30
174
RYAN
SALISBURY
30
175
MONICA
HICKS
30
176
MARC
OUTLAW
30
177
TERRENCE
GUNDERSON
30
178 rows × 3 columns
In [11]:
SQL_QUERY = """
SELECT
c.first_name,
c.last_name,
COUNT(*) "rentals_per_customer"
FROM
rental r,
customer c
WHERE
r.customer_id = c.customer_id
GROUP BY c.customer_id
HAVING rentals_per_customer >= 30
;
"""
pd.read_sql(SQL_QUERY, db)
Out[11]:
first_name
last_name
rentals_per_customer
0
MARY
SMITH
32
1
ELIZABETH
BROWN
38
2
MARIA
MILLER
33
3
HELEN
HARRIS
32
4
SHARON
ROBINSON
30
5
MICHELLE
CLARK
35
6
SARAH
LEWIS
30
7
JESSICA
HALL
34
8
SHIRLEY
ALLEN
31
9
CYNTHIA
YOUNG
32
10
ANGELA
HERNANDEZ
36
11
MELISSA
KING
34
12
VIRGINIA
GREEN
32
13
MARTHA
GONZALEZ
34
14
CAROLYN
PEREZ
30
15
CATHERINE
CAMPBELL
34
16
DIANE
COLLINS
35
17
ALICE
STEWART
33
18
HEATHER
MORRIS
30
19
GLORIA
COOK
30
20
JUDITH
COX
33
21
JANICE
WARD
34
22
KATHY
JAMES
30
23
THERESA
WATSON
30
24
TAMMY
SANDERS
41
25
LORI
WOOD
31
26
MARILYN
ROSS
30
27
SARA
PERRY
33
28
JACQUELINE
LONG
33
29
WANDA
PATTERSON
30
...
...
...
...
148
TOM
MILNER
32
149
ALEX
GRESHAM
33
150
TOMMY
COLLAZO
38
151
WARREN
SHERROD
33
152
TIM
CARY
39
153
WESLEY
BULL
40
154
GORDON
ALLARD
32
155
GREG
ROBINS
30
156
JORGE
OLIVARES
34
157
ZACHARY
HITE
31
158
ROBERTO
VU
30
159
RAMON
CHOATE
31
160
MARC
OUTLAW
30
161
BRETT
CORNWELL
34
162
ANGEL
BARCLAY
32
163
LESLIE
SEWARD
35
164
DUANE
TUBBS
31
165
MITCHELL
WESTMORELAND
32
166
ARNOLD
HAVENS
33
167
KARL
SEAL
45
168
NEIL
RENNER
32
169
JESSIE
MILAM
33
170
JAVIER
ELROD
32
171
GUY
BROWNLEE
32
172
JORDAN
ARCHULETA
30
173
CASEY
MENA
34
174
DAVE
GARDINER
32
175
MORRIS
MCCARTER
34
176
SALVADOR
TEEL
30
177
TERRENCE
GUNDERSON
30
178 rows × 3 columns
In [ ]:
In [20]:
RENTALS_PER_CUSTOMER_SQL_QUERY = """
SELECT
c.first_name,
c.last_name,
COUNT(*) "rentals_per_customer"
FROM
rental r
JOIN customer c
ON r.customer_id = c.customer_id
GROUP BY c.customer_id
;
"""
SQL_QUERY = """
SELECT *
FROM ({RENTALS_PER_CUSTOMER_SQL_QUERY}) as rpc
WHERE rentals_per_customer >= 30
;
""".format(RENTALS_PER_CUSTOMER_SQL_QUERY=RENTALS_PER_CUSTOMER_SQL_QUERY.replace(";", ""))
# print(SQL_QUERY)
pd.read_sql(SQL_QUERY, db)
Out[20]:
first_name
last_name
rentals_per_customer
0
MARY
SMITH
32
1
ELIZABETH
BROWN
38
2
MARIA
MILLER
33
3
HELEN
HARRIS
32
4
SHARON
ROBINSON
30
5
MICHELLE
CLARK
35
6
SARAH
LEWIS
30
7
JESSICA
HALL
34
8
SHIRLEY
ALLEN
31
9
CYNTHIA
YOUNG
32
10
ANGELA
HERNANDEZ
36
11
MELISSA
KING
34
12
VIRGINIA
GREEN
32
13
MARTHA
GONZALEZ
34
14
CAROLYN
PEREZ
30
15
CATHERINE
CAMPBELL
34
16
DIANE
COLLINS
35
17
ALICE
STEWART
33
18
HEATHER
MORRIS
30
19
GLORIA
COOK
30
20
JUDITH
COX
33
21
JANICE
WARD
34
22
KATHY
JAMES
30
23
THERESA
WATSON
30
24
TAMMY
SANDERS
41
25
LORI
WOOD
31
26
MARILYN
ROSS
30
27
SARA
PERRY
33
28
JACQUELINE
LONG
33
29
WANDA
PATTERSON
30
...
...
...
...
148
TOM
MILNER
32
149
ALEX
GRESHAM
33
150
TOMMY
COLLAZO
38
151
WARREN
SHERROD
33
152
TIM
CARY
39
153
WESLEY
BULL
40
154
GORDON
ALLARD
32
155
GREG
ROBINS
30
156
JORGE
OLIVARES
34
157
ZACHARY
HITE
31
158
ROBERTO
VU
30
159
RAMON
CHOATE
31
160
MARC
OUTLAW
30
161
BRETT
CORNWELL
34
162
ANGEL
BARCLAY
32
163
LESLIE
SEWARD
35
164
DUANE
TUBBS
31
165
MITCHELL
WESTMORELAND
32
166
ARNOLD
HAVENS
33
167
KARL
SEAL
45
168
NEIL
RENNER
32
169
JESSIE
MILAM
33
170
JAVIER
ELROD
32
171
GUY
BROWNLEE
32
172
JORDAN
ARCHULETA
30
173
CASEY
MENA
34
174
DAVE
GARDINER
32
175
MORRIS
MCCARTER
34
176
SALVADOR
TEEL
30
177
TERRENCE
GUNDERSON
30
178 rows × 3 columns
In [21]:
rc_df = rental_df.merge(customer_df, on="customer_id")
In [22]:
rc_df.groupby("customer_id").size() >= 30
Out[22]:
customer_id
1 True
2 False
3 False
4 False
5 True
6 False
7 True
8 False
9 False
10 False
11 False
12 False
13 False
14 False
15 True
16 False
17 False
18 False
19 False
20 True
21 True
22 False
23 True
24 False
25 False
26 True
27 True
28 True
29 True
30 True
...
570 False
571 False
572 False
573 False
574 False
575 False
576 True
577 False
578 False
579 False
580 False
581 False
582 False
583 False
584 True
585 False
586 False
587 False
588 False
589 False
590 False
591 False
592 False
593 False
594 False
595 True
596 False
597 False
598 False
599 False
dtype: bool
In [27]:
rentals_per_customer_df = rc_df.groupby("customer_id").agg({"customer_id": np.size})
In [25]:
is_30 = rentals_per_customer_df.customer_id > 30
rentals_per_customer_df[is_30]
Out[25]:
customer_id
customer_id
1
32
5
38
7
33
15
32
21
35
26
34
27
31
28
32
29
36
30
34
35
32
38
34
46
34
50
35
51
33
64
33
66
34
75
41
78
31
84
33
86
33
89
32
91
35
102
33
103
31
114
33
119
34
120
32
122
32
125
32
...
...
416
31
438
33
439
36
442
32
446
31
451
33
452
32
454
33
459
38
462
33
468
39
469
40
470
32
473
34
479
31
494
31
502
34
503
32
506
35
513
31
520
32
522
33
526
45
532
32
533
33
535
32
550
32
566
34
569
32
576
34
134 rows × 1 columns
Content source: kimkipyo/dss_git_kkp
Similar notebooks: